In [6]:
import pandas as pd
import numpy as np
terror = pd.read_csv('file.csv', encoding='ISO-8859-1')
cleanedforuse = terror.filter(['imonth', 'iday', 'region', 'property', 'propextent',
                               'attacktype1', 'weaptype1', 'nperps', 'success',
                               'multiple', 'specificity'])
final = cleanedforuse.dropna()  # keep only rows with no missing values
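In [ ]:
# A quick look at what dropping incomplete rows costs, per column and overall
# (diagnostic sketch; not required for the pipeline):
print(cleanedforuse.isnull().sum())
print(len(cleanedforuse), '->', len(final), 'rows kept after dropping missing values')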
In [7]:
final.head()
Out[7]:
In [8]:
import sqlite3
conn = sqlite3.connect('Terrorisks.db')
In [9]:
final.to_sql('final', con=conn, if_exists='replace')  # the 'flavor' argument was removed from pandas; sqlite is the default
In [10]:
df = pd.read_sql_query('SELECT * FROM final', conn)
In [11]:
df.head(10)
Out[11]:
In [12]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score  # sklearn.cross_validation was renamed to model_selection
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
In [14]:
y, X = dmatrices('success ~ C(imonth) + C(iday) + region + C(property) + C(propextent) + C(attacktype1) + C(weaptype1)+ C(nperps) + specificity', df, return_type="dataframe")
print(X)
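In [ ]:
# patsy's C() marks a column as categorical and expands it into treatment-coded
# dummy columns, absorbing one level into the Intercept; a quick peek at the
# resulting design matrix (sketch):
print(X.shape)
print(list(X.columns[:10]))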
In [24]:
y = np.ravel(y)
# instantiate a logistic regression model, and fit with X and y
model = LogisticRegression()
model = model.fit(X, y)
# what percentage of attacks were successful? (the majority-class benchmark)
print("Benchmark:")
b = y.mean()
print(b)
# check the accuracy on the training set
a = model.score(X, y)
print("Score:")
print(a)
print(model.coef_)  # coefficients, in the same order as the design-matrix columns
# evaluate the model by splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
model2 = LogisticRegression()
model2.fit(X_train, y_train)
# predict class labels for the test set
predicted = model2.predict(X_test)
print (predicted)
# generate class probabilities
probs = model2.predict_proba(X_test)
print (probs)
# generate evaluation metrics
print (metrics.accuracy_score(y_test, predicted))
print (metrics.roc_auc_score(y_test, probs[:, 1]))
print (metrics.confusion_matrix(y_test, predicted))
print (metrics.classification_report(y_test, predicted))
scores = cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=10)
print (scores)
print (scores.mean())
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, probs[:, 1])  # probabilities, not hard labels, give a full curve
roc_auc = auc(false_positive_rate, true_positive_rate)
print('AUC = %0.4f'% roc_auc)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b',label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
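In [ ]:
# Logistic-regression coefficients are log-odds; exponentiating gives odds
# ratios, which are easier to read off (interpretation sketch):
odds_ratios = np.exp(model.coef_[0])
for name, ratio in zip(X.columns, odds_ratios):
    print("%s: %.3f" % (name, ratio))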
In [25]:
y, X = dmatrices('multiple ~ C(imonth) + C(iday) + region + C(property) + C(propextent) + C(attacktype1) + C(weaptype1)+ C(nperps) + specificity', df, return_type="dataframe")
In [26]:
y = np.ravel(y)
# instantiate a logistic regression model, and fit with X and y
model = LogisticRegression()
model = model.fit(X, y)
# what percentage had multiple?
print("Benchmark:")
b = y.mean()
print(b)
# check the accuracy on the training set
a = model.score(X, y)
print("Score:")
print(a)
print(model.coef_)  # coefficients, in the same order as the design-matrix columns
# evaluate the model by splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
model2 = LogisticRegression()
model2.fit(X_train, y_train)
# predict class labels for the test set
predicted = model2.predict(X_test)
print (predicted)
# generate class probabilities
probs = model2.predict_proba(X_test)
print (probs)
# generate evaluation metrics
print (metrics.accuracy_score(y_test, predicted))
print (metrics.roc_auc_score(y_test, probs[:, 1]))
print (metrics.confusion_matrix(y_test, predicted))
print (metrics.classification_report(y_test, predicted))
scores = cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=10)
print (scores)
print (scores.mean())
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, probs[:, 1])  # probabilities, not hard labels, give a full curve
roc_auc = auc(false_positive_rate, true_positive_rate)
print('AUC = %0.4f'% roc_auc)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b',label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
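In [ ]:
# 'multiple' is heavily imbalanced (mean around 0.13), so plain accuracy mostly
# tracks the majority class. One option worth comparing is class weighting;
# a sketch on the same train/test split:
model_bal = LogisticRegression(class_weight='balanced')
model_bal.fit(X_train, y_train)
print(metrics.roc_auc_score(y_test, model_bal.predict_proba(X_test)[:, 1]))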
In [31]:
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was renamed to model_selection
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import pandas as pd
In [56]:
y = df['multiple']
In [57]:
X = df.filter(['imonth', 'iday', 'region','property',
'propextent','attacktype1','weaptype1','nperps','specificity'])
# one-hot encode the categorical inputs in a single pass
Xsix = pd.get_dummies(X,
                      columns=['imonth', 'iday', 'region', 'attacktype1', 'weaptype1', 'specificity'],
                      prefix={'imonth': 'month', 'iday': 'day', 'region': 'region',
                              'attacktype1': 'attacktype', 'weaptype1': 'weapontype',
                              'specificity': 'specificity'})
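In [ ]:
# One-hot encoding widens the frame from 9 raw columns to one column per
# category level; a quick shape check (sketch):
print(X.shape, '->', Xsix.shape)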
In [58]:
features_train, features_test, target_train, target_test = train_test_split(Xsix, y, test_size=0.2, random_state=0)
In [59]:
print("Benchmark: " )
print(1-(y.mean()))
In [60]:
# Random Forest
forest = RandomForestClassifier(n_estimators=10)
forest = forest.fit(features_train, target_train)
output = forest.predict(features_test).astype(int)
forest.score(features_train, target_train)  # training-set accuracy (optimistic; held-out score below)
Out[60]:
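In [ ]:
# The score above is on the training data, which flatters the forest; the
# held-out split gives a fairer read (sketch):
print(forest.score(features_test, target_test))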
In [49]:
probs = forest.predict_proba(features_test)[:, 1]  # probabilities, not hard labels, for the ROC curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(target_test, probs)
roc_auc = auc(false_positive_rate, true_positive_rate)
print('AUC = %0.4f'% roc_auc)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b',label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [50]:
scores = cross_val_score(forest, Xsix, y, scoring='accuracy', cv=10)  # score on the same dummified features the forest was trained on
print (scores)
print (scores.mean())
In [43]:
y = df['success']
X = df.filter(['imonth', 'iday', 'region','property',
'propextent','attacktype1','weaptype1','nperps','specificity'])
features_train, features_test, target_train, target_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Random Forest
forest = RandomForestClassifier(n_estimators=10)
forest = forest.fit(features_train, target_train)
output = forest.predict(features_test).astype(int)
score = forest.score(features_train, target_train)  # training-set accuracy
print("Benchmark: ")
print(y.mean())
print('Our Accuracy:')
print(score)
probs = forest.predict_proba(features_test)[:, 1]  # probabilities, not hard labels, for the ROC curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(target_test, probs)
roc_auc = auc(false_positive_rate, true_positive_rate)
print('AUC = %0.4f'% roc_auc)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b',label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
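In [ ]:
# Which inputs the forest actually leans on; feature_importances_ lines up
# with the training columns (diagnostic sketch):
for name, imp in sorted(zip(X.columns, forest.feature_importances_), key=lambda t: -t[1]):
    print("%s: %.3f" % (name, imp))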
In [53]:
from sklearn.tree import _tree
def leaf_depths(tree, node_id=0):
    '''
    tree.children_left and tree.children_right store ids
    of left and right children of a given node
    '''
    left_child = tree.children_left[node_id]
    right_child = tree.children_right[node_id]
    # If a given node is terminal, both left and right children
    # are set to _tree.TREE_LEAF
    if left_child == _tree.TREE_LEAF:
        # set depth of terminal nodes to 0
        depths = np.array([0])
    else:
        # get depths of left and right children and increment them by 1
        left_depths = leaf_depths(tree, left_child) + 1
        right_depths = leaf_depths(tree, right_child) + 1
        depths = np.append(left_depths, right_depths)
    return depths
def leaf_samples(tree, node_id=0):
    left_child = tree.children_left[node_id]
    right_child = tree.children_right[node_id]
    if left_child == _tree.TREE_LEAF:
        samples = np.array([tree.n_node_samples[node_id]])
    else:
        left_samples = leaf_samples(tree, left_child)
        right_samples = leaf_samples(tree, right_child)
        samples = np.append(left_samples, right_samples)
    return samples
def draw_tree(ensemble, tree_id=0):
    plt.figure(figsize=(8, 8))
    plt.subplot(211)
    tree = ensemble.estimators_[tree_id].tree_
    depths = leaf_depths(tree)
    plt.hist(depths, histtype='step', color='#9933ff',
             bins=range(min(depths), max(depths) + 1))
    plt.xlabel("Depth of leaf nodes (tree %s)" % tree_id)
    plt.subplot(212)
    samples = leaf_samples(tree)
    plt.hist(samples, histtype='step', color='#3399ff',
             bins=range(min(samples), max(samples) + 1))
    plt.xlabel("Number of samples in leaf nodes (tree %s)" % tree_id)
    plt.show()
def draw_ensemble(ensemble):
    plt.figure(figsize=(8, 8))
    plt.subplot(211)
    depths_all = np.array([], dtype=int)
    for x in ensemble.estimators_:
        tree = x.tree_
        depths = leaf_depths(tree)
        depths_all = np.append(depths_all, depths)
        plt.hist(depths, histtype='step', color='#ddaaff',
                 bins=range(min(depths), max(depths) + 1))
    plt.hist(depths_all, histtype='step', color='#9933ff',
             bins=range(min(depths_all), max(depths_all) + 1),
             weights=np.ones(len(depths_all)) / len(ensemble.estimators_),
             linewidth=2)
    plt.xlabel("Depth of leaf nodes")
    samples_all = np.array([], dtype=int)
    plt.subplot(212)
    for x in ensemble.estimators_:
        tree = x.tree_
        samples = leaf_samples(tree)
        samples_all = np.append(samples_all, samples)
        plt.hist(samples, histtype='step', color='#aaddff',
                 bins=range(min(samples), max(samples) + 1))
    plt.hist(samples_all, histtype='step', color='#3399ff',
             bins=range(min(samples_all), max(samples_all) + 1),
             weights=np.ones(len(samples_all)) / len(ensemble.estimators_),
             linewidth=2)
    plt.xlabel("Number of samples in leaf nodes")
    plt.show()
In [61]:
draw_tree(forest)
In [62]:
draw_ensemble(forest)
In [64]:
y = df['multiple']
X = df.filter(['imonth', 'iday', 'region','property',
'propextent','attacktype1','weaptype1','nperps','specificity'])
features_train, features_test, target_train, target_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Random Forest, with tree depth capped to curb overfitting
forest = RandomForestClassifier(n_estimators=10, max_depth=16)
forest = forest.fit(features_train, target_train)
output = forest.predict(features_test).astype(int)
score = forest.score(features_train, target_train)  # training-set accuracy
print("Benchmark: ")
print(1 - y.mean())
print('Our Accuracy:')
print(score)
probs = forest.predict_proba(features_test)[:, 1]  # probabilities, not hard labels, for the ROC curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(target_test, probs)
roc_auc = auc(false_positive_rate, true_positive_rate)
print('AUC = %0.4f'% roc_auc)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b',label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
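In [ ]:
# How the depth cap trades training fit against held-out accuracy; a rough
# sweep over a few candidate depths (sketch):
for d in (4, 8, 16, None):
    f = RandomForestClassifier(n_estimators=10, max_depth=d, random_state=0)
    f.fit(features_train, target_train)
    print(d, f.score(features_train, target_train), f.score(features_test, target_test))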
In [ ]:
import pandas as pd
In [ ]:
df = pd.read_csv('/Users/Laishumin/Datasets/globalterrorism.csv', encoding='ISO-8859-1', low_memory=False)
clean = df[['iyear', 'imonth', 'iday', 'region', 'specificity', 'vicinity',
            'crit1', 'crit2', 'crit3', 'doubtterr', 'multiple', 'success',
            'suicide', 'attacktype1', 'ingroup', 'guncertain1', 'weaptype1']]
In [ ]:
df_dummies1= pd.get_dummies(clean, prefix='month', columns=['imonth'])
In [ ]:
df_dummies2= pd.get_dummies(df_dummies1, prefix='region', columns=['region'])
In [ ]:
df_dummies3= pd.get_dummies(df_dummies2, prefix='specificity', columns=['specificity'])
In [ ]:
df_dummies4= pd.get_dummies(df_dummies3, prefix='attack_type', columns=['attacktype1'])
In [ ]:
df_dummies5= pd.get_dummies(df_dummies4, prefix='main_weapon_type', columns=['weaptype1'])
In [ ]:
data = df_dummies5.copy()  # copy so the column deletions below do not alter df_dummies5
del data['iyear']
del data['iday']
del data['guncertain1']
del data['ingroup']
del data['doubtterr']
In [ ]:
names = list(data.columns.values)
names
In [ ]:
lift_multiple = []
for i in names:
    # count incidents where this (binary) feature equals 1
    num_Feature = int((data[i] == 1).sum())
    print("{0} from {1}".format(num_Feature, i))
    # of those, how many also have multiple == 1 (rule valid) vs not (invalid)
    rule_valid = int(((data[i] == 1) & (data['multiple'] == 1)).sum())
    rule_invalid = num_Feature - rule_valid
    print("{0} cases of the rule being valid were discovered".format(rule_valid))
    print("{0} cases of the rule being invalid were discovered".format(rule_invalid))
    # Now we have all the information needed to compute Support and Confidence
    support = rule_valid  # the Support is the number of times the rule is discovered
    if num_Feature == 0:
        lift_multiple.append(0)
    else:
        confidence = rule_valid / num_Feature
        lift = confidence / 0.13  # 0.13 = overall share of incidents with multiple == 1
        lift_multiple.append(lift)
        print(i + '-->Multiple')
        print("The support is {0}, the confidence is {1:.3f}, and the lift is {2:.3f}.".format(support, confidence, lift))
        print("As a percentage, the confidence is {0:.1f}%.".format(100 * confidence))
        print("-----------------------------------------------------------------")
In [ ]:
lift_multiple_pd = pd.DataFrame({'Lift': lift_multiple}, index=names)
lift_multiple_pd
In [ ]:
graph = lift_multiple_pd.sort_values('Lift', ascending=False)  # DataFrame.sort was removed from pandas
graph
In [ ]:
%matplotlib inline
graph.plot(kind='bar')
In [8]:
import numpy as np
import seaborn as sns
import pandas as pd
sns.violinplot(x="weaptype1", y="success", data=df, palette="Set3")
Out[8]:
In [9]:
sns.violinplot(x="propextent", y="multiple", data=df, palette="Set3")
Out[9]:
In [10]:
sns.violinplot(x="imonth", y="multiple", data=df, palette="Set3")
Out[10]:
In [11]:
sns.violinplot(x="property", y="multiple", data=df, palette="Set3")
Out[11]:
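In [ ]:
# With a binary outcome, a normalized cross-tab can read more directly than a
# violin plot; an alternative view of the same relationship (sketch):
print(pd.crosstab(df['property'], df['multiple'], normalize='index'))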